% Jonas et al. (2019), arXiv preprint on serverless computing.
% The proper noun in the title is brace-protected so styles cannot lowercase it.
@article{jonas2019cloud,
  author  = {Jonas, Eric and Schleier-Smith, Johann and Sreekanti, Vikram and Tsai, Chia-Che and Khandelwal, Anurag and Pu, Qifan and Shankar, Vaishaal and Carreira, Joao and Krauth, Karl and Yadwadkar, Neeraja and others},
  title   = {Cloud programming simplified: A {Berkeley} view on serverless computing},
  journal = {arXiv preprint arXiv:1902.03383},
  year    = {2019},
}

% Baldini et al. (2017): chapter in "Research Advances in Cloud Computing".
@incollection{baldini2017serverless,
  author    = {Baldini, Ioana and Castro, Paul and Chang, Kerry and Cheng, Perry and Fink, Stephen and Ishakian, Vatche and Mitchell, Nick and Muthusamy, Vinod and Rabbah, Rodric and Slominski, Aleksander and others},
  title     = {Serverless computing: Current trends and open problems},
  booktitle = {Research Advances in Cloud Computing},
  publisher = {Springer},
  pages     = {1--20},
  year      = {2017},
}

% Das et al. (2016): arXiv preprint on synchronous-SGD distributed training.
@article{mkldnn1,
  author  = {Das, Dipankar and Avancha, Sasikanth and Mudigere, Dheevatsa and Vaidynathan, Karthikeyan and Sridharan, Srinivas and Kalamkar, Dhiraj and Kaul, Bharat and Dubey, Pradeep},
  title   = {Distributed deep learning using synchronous stochastic gradient descent},
  journal = {arXiv preprint arXiv:1602.06709},
  year    = {2016},
}

% Peng et al. (2018), CVPR paper. The system name is brace-protected with
% its proper capitalisation.
@inproceedings{megdet,
  author    = {Peng, Chao and Xiao, Tete and Li, Zeming and Jiang, Yuning and Zhang, Xiangyu and Jia, Kai and Yu, Gang and Sun, Jian},
  title     = {{MegDet}: A large mini-batch object detector},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {6181--6189},
  year      = {2018},
}

% Krizhevsky, Sutskever and Hinton (2012), conference version.
% The dataset name is brace-protected with its proper capitalisation.
@inproceedings{cnn1,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  title     = {{ImageNet} classification with deep convolutional neural networks},
  booktitle = {Advances in neural information processing systems},
  pages     = {1097--1105},
  year      = {2012},
}

% Howard et al. (2017), arXiv preprint. The same work is also listed in this
% file under the key "mobilenet"; both keys are kept so existing citations resolve.
@article{howard2017mobilenets,
  author  = {Howard, Andrew G and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig},
  title   = {{MobileNets}: Efficient convolutional neural networks for mobile vision applications},
  journal = {arXiv preprint arXiv:1704.04861},
  year    = {2017},
}

% Szegedy et al. (2015), CVPR paper "Going deeper with convolutions".
@inproceedings{cnn2,
  author    = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew},
  title     = {Going deeper with convolutions},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1--9},
  year      = {2015},
}

% Keskar et al. (2016), arXiv preprint on large-batch training.
@article{gap1,
  author  = {Keskar, Nitish Shirish and Mudigere, Dheevatsa and Nocedal, Jorge and Smelyanskiy, Mikhail and Tang, Ping Tak Peter},
  title   = {On large-batch training for deep learning: Generalization gap and sharp minima},
  journal = {arXiv preprint arXiv:1609.04836},
  year    = {2016},
}


% Chen et al. (2018), OSDI paper. The author list is complete (it previously
% stopped after the tenth author without "and others"); the acronym is
% brace-protected against sentence-casing.
@inproceedings{tvm,
  author    = {Chen, Tianqi and Moreau, Thierry and Jiang, Ziheng and Zheng, Lianmin and Yan, Eddie and Shen, Haichen and Cowan, Meghan and Wang, Leyuan and Hu, Yuwei and Ceze, Luis and Guestrin, Carlos and Krishnamurthy, Arvind},
  title     = {{TVM}: An automated end-to-end optimizing compiler for deep learning},
  booktitle = {13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)},
  pages     = {578--594},
  year      = {2018},
}

% Goyal et al. (2017), arXiv preprint. The acronym and dataset name are
% brace-protected with their proper capitalisation.
@article{large1,
  author  = {Goyal, Priya and Doll{\'a}r, Piotr and Girshick, Ross and Noordhuis, Pieter and Wesolowski, Lukasz and Kyrola, Aapo and Tulloch, Andrew and Jia, Yangqing and He, Kaiming},
  title   = {Accurate, large minibatch {SGD}: Training {ImageNet} in 1 hour},
  journal = {arXiv preprint arXiv:1706.02677},
  year    = {2017},
}

% Sergeev and Del Balso (2018): Horovod, CoRR/arXiv record exported from DBLP.
@article{DBLP:journals/corr/abs-1802-05799,
  author        = {Alexander Sergeev and Mike Del Balso},
  title         = {Horovod: fast and easy distributed deep learning in TensorFlow},
  journal       = {CoRR},
  volume        = {abs/1802.05799},
  year          = {2018},
  eprint        = {1802.05799},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1802.05799},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1802-05799},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  timestamp     = {Mon, 13 Aug 2018 16:46:12 +0200},
}

% Krizhevsky, Sutskever and Hinton (2017): Communications of the ACM version,
% record exported from DBLP.
@article{DBLP:journals/cacm/KrizhevskySH17,
  author    = {Alex Krizhevsky and Ilya Sutskever and Geoffrey E. Hinton},
  title     = {ImageNet classification with deep convolutional neural networks},
  journal   = {Commun. {ACM}},
  volume    = {60},
  number    = {6},
  pages     = {84--90},
  year      = {2017},
  doi       = {10.1145/3065386},
  url       = {http://doi.acm.org/10.1145/3065386},
  biburl    = {https://dblp.org/rec/bib/journals/cacm/KrizhevskySH17},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Sun, 02 Jun 2019 20:48:58 +0200},
}

% You et al. (2018), ICPP paper. The dataset name is brace-protected with its
% proper capitalisation.
@inproceedings{large2,
  author       = {You, Yang and Zhang, Zhao and Hsieh, Cho-Jui and Demmel, James and Keutzer, Kurt},
  title        = {{ImageNet} training in minutes},
  booktitle    = {Proceedings of the 47th International Conference on Parallel Processing},
  pages        = {1},
  year         = {2018},
  organization = {ACM},
}

% Li et al. (2014), OSDI paper on the parameter server.
@inproceedings{ps,
  author    = {Li, Mu and Andersen, David G and Park, Jun Woo and Smola, Alexander J and Ahmed, Amr and Josifovski, Vanja and Long, James and Shekita, Eugene J and Su, Bor-Yiing},
  title     = {Scaling distributed machine learning with the parameter server},
  booktitle = {11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)},
  pages     = {583--598},
  year      = {2014},
}

% Mitliagkas et al. (2016), Allerton conference paper.
@inproceedings{asy1,
  author       = {Mitliagkas, Ioannis and Zhang, Ce and Hadjis, Stefan and R{\'e}, Christopher},
  title        = {Asynchrony begets momentum, with an application to deep learning},
  booktitle    = {2016 54th Annual Allerton Conference on Communication, Control, and Computing (Allerton)},
  organization = {IEEE},
  pages        = {997--1004},
  year         = {2016},
}

% Recht et al. (2011), Hogwild paper. The second author's surname carries the
% same accent used for this author in the "asy1" entry of this file.
@inproceedings{asy2,
  author    = {Recht, Benjamin and R{\'e}, Christopher and Wright, Stephen and Niu, Feng},
  title     = {Hogwild: A lock-free approach to parallelizing stochastic gradient descent},
  booktitle = {Advances in neural information processing systems},
  pages     = {693--701},
  year      = {2011},
}

% Hazelwood et al. (2018), HPCA paper; record exported from DBLP.
@inproceedings{DBLP:conf/hpca/HazelwoodBBCDDF18,
  author    = {Kim M. Hazelwood and Sarah Bird and David M. Brooks and Soumith Chintala and Utku Diril and Dmytro Dzhulgakov and Mohamed Fawzy and Bill Jia and Yangqing Jia and Aditya Kalro and James Law and Kevin Lee and Jason Lu and Pieter Noordhuis and Misha Smelyanskiy and Liang Xiong and Xiaodong Wang},
  title     = {Applied Machine Learning at Facebook: {A} Datacenter Infrastructure Perspective},
  booktitle = {{IEEE} International Symposium on High Performance Computer Architecture, {HPCA} 2018, Vienna, Austria, February 24-28, 2018},
  pages     = {620--629},
  year      = {2018},
}

% Chen et al. (2015), arXiv preprint. The library name is brace-protected with
% its proper capitalisation.
@article{mxnet,
  author  = {Chen, Tianqi and Li, Mu and Li, Yutian and Lin, Min and Wang, Naiyan and Wang, Minjie and Xiao, Tianjun and Xu, Bing and Zhang, Chiyuan and Zhang, Zheng},
  title   = {{MXNet}: A flexible and efficient machine learning library for heterogeneous distributed systems},
  journal = {arXiv preprint arXiv:1512.01274},
  year    = {2015},
}

% Dean et al. (2012). The same work also appears in this file under the key
% "large_scale"; the author list here now includes Ranzato, matching that
% entry's ordering.
@inproceedings{datap2,
  author    = {Dean, Jeffrey and Corrado, Greg and Monga, Rajat and Chen, Kai and Devin, Matthieu and Mao, Mark and Ranzato, Marc'Aurelio and Senior, Andrew and Tucker, Paul and Yang, Ke and Le, Quoc V and others},
  title     = {Large scale distributed deep networks},
  booktitle = {Advances in neural information processing systems},
  pages     = {1223--1231},
  year      = {2012},
}

% Chen et al. (2016), arXiv preprint; also listed in this file as "revisiting".
@article{datap1,
  author  = {Chen, Jianmin and Pan, Xinghao and Monga, Rajat and Bengio, Samy and Jozefowicz, Rafal},
  title   = {Revisiting distributed synchronous SGD},
  journal = {arXiv preprint arXiv:1604.00981},
  year    = {2016},
}

% Coates et al. (2013), ICML paper. The last author is given as
% "surname, given name" like the others (it was reversed), and the acronyms
% in the title are brace-protected.
@inproceedings{modelp,
  author    = {Coates, Adam and Huval, Brody and Wang, Tao and Wu, David and Catanzaro, Bryan and Ng, Andrew},
  title     = {Deep learning with {COTS} {HPC} systems},
  booktitle = {International conference on machine learning},
  pages     = {1337--1345},
  year      = {2013},
}

% Howard et al. (2017), arXiv preprint. The same work is also listed in this
% file under the key "howard2017mobilenets"; both keys are kept so existing
% citations resolve.
@article{mobilenet,
  author  = {Howard, Andrew G and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig},
  title   = {{MobileNets}: Efficient convolutional neural networks for mobile vision applications},
  journal = {arXiv preprint arXiv:1704.04861},
  year    = {2017},
}

% Bottou (2010): chapter in the COMPSTAT'2010 proceedings volume.
@incollection{sgd,
  author    = {Bottou, L{\'e}on},
  title     = {Large-scale machine learning with stochastic gradient descent},
  booktitle = {Proceedings of COMPSTAT'2010},
  publisher = {Springer},
  pages     = {177--186},
  year      = {2010},
}

% He et al. (2016), CVPR paper on deep residual learning.
@inproceedings{resnet,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}

% Hill and Marty (2008), article in Computer.
@article{multicores1,
  author    = {Hill, Mark D and Marty, Michael R},
  title     = {Amdahl's law in the multicore era},
  journal   = {Computer},
  publisher = {IEEE},
  volume    = {41},
  number    = {7},
  pages     = {33--38},
  year      = {2008},
}

% Chu et al. (2007), conference paper on map-reduce for machine learning.
@inproceedings{multicores2,
  author    = {Chu, Cheng-Tao and Kim, Sang K and Lin, Yi-An and Yu, YuanYuan and Bradski, Gary and Olukotun, Kunle and Ng, Andrew Y},
  title     = {Map-reduce for machine learning on multicore},
  booktitle = {Advances in neural information processing systems},
  pages     = {281--288},
  year      = {2007},
}

% Deng et al. (2009), CVPR paper introducing the dataset. The dataset name is
% brace-protected and the organization acronym is written in capitals.
@inproceedings{imagenet,
  author       = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  title        = {{ImageNet}: A large-scale hierarchical image database},
  booktitle    = {2009 IEEE conference on computer vision and pattern recognition},
  pages        = {248--255},
  year         = {2009},
  organization = {IEEE},
}

% Software reference: the MKL-DNN library, cited by its GitHub URL.
% Product and library names are brace-protected so styles cannot lowercase them.
@misc{mkldnn,
  title        = {{Intel(R)} {Math} {Kernel} {Library} for Deep Neural Networks ({Intel(R)} {MKL-DNN})},
  howpublished = {\url{https://github.com/intel/mkl-dnn}},
}

% Abadi et al. (2016), OSDI paper. The booktitle uses plain protective braces
% around the acronyms (the exported form had math-mode escaped braces, which
% print literal curly braces), and the system name in the title is protected.
@inproceedings{tf,
  author    = {Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and others},
  title     = {{TensorFlow}: A system for large-scale machine learning},
  booktitle = {12th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 16)},
  pages     = {265--283},
  year      = {2016},
}

% Jia et al. (2014), ACM Multimedia paper on Caffe.
@inproceedings{caffe,
  author       = {Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and Darrell, Trevor},
  title        = {Caffe: Convolutional architecture for fast feature embedding},
  booktitle    = {Proceedings of the 22nd ACM international conference on Multimedia},
  organization = {ACM},
  pages        = {675--678},
  year         = {2014},
}

% Hochreiter and Schmidhuber (1997), Neural Computation article.
@article{lstm,
  author    = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  title     = {Long short-term memory},
  journal   = {Neural computation},
  publisher = {MIT Press},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780},
  year      = {1997},
}

% Simonyan and Zisserman (2014), arXiv preprint on very deep convolutional networks.
@article{cnn3,
  author  = {Simonyan, Karen and Zisserman, Andrew},
  title   = {Very deep convolutional networks for large-scale image recognition},
  journal = {arXiv preprint arXiv:1409.1556},
  year    = {2014},
}

% Manning and Schuetze (1999), textbook. Each author is listed once (the
% first author was duplicated), and the publisher name is capitalised as in
% the other MIT Press entries of this file.
@book{nlp1,
  author    = {Manning, Christopher D and Sch{\"u}tze, Hinrich},
  title     = {Foundations of statistical natural language processing},
  publisher = {MIT Press},
  year      = {1999},
}

% Graves, Mohamed and Hinton (2013), ICASSP paper.
@inproceedings{nlp2,
  author       = {Graves, Alex and Mohamed, Abdel-rahman and Hinton, Geoffrey},
  title        = {Speech recognition with deep recurrent neural networks},
  booktitle    = {2013 IEEE international conference on acoustics, speech and signal processing},
  organization = {IEEE},
  pages        = {6645--6649},
  year         = {2013},
}

% Mikolov et al. (2010), Interspeech paper. The fourth author's surname ends
% in y-acute (it was exported with a grave accent).
@inproceedings{nlp3,
  author    = {Mikolov, Tom{\'a}{\v{s}} and Karafi{\'a}t, Martin and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\'y}, Jan and Khudanpur, Sanjeev},
  title     = {Recurrent neural network based language model},
  booktitle = {Eleventh annual conference of the international speech communication association},
  year      = {2010},
}

% Williams and Zipser (1989), Neural Computation article.
@article{nlp4,
  author    = {Williams, Ronald J and Zipser, David},
  title     = {A learning algorithm for continually running fully recurrent neural networks},
  journal   = {Neural computation},
  publisher = {MIT Press},
  volume    = {1},
  number    = {2},
  pages     = {270--280},
  year      = {1989},
}

% Koren, Bell and Volinsky (2009), article in Computer. The volume number is
% supplied alongside the issue number (an issue number alone is incomplete).
@article{recommender2,
  author    = {Koren, Yehuda and Bell, Robert and Volinsky, Chris},
  title     = {Matrix factorization techniques for recommender systems},
  journal   = {Computer},
  volume    = {42},
  number    = {8},
  pages     = {30--37},
  year      = {2009},
  publisher = {IEEE},
}

% Herlocker et al. (2004), ACM TOIS article.
@article{recommender1,
  author    = {Herlocker, Jonathan L and Konstan, Joseph A and Terveen, Loren G and Riedl, John T},
  title     = {Evaluating collaborative filtering recommender systems},
  journal   = {ACM Transactions on Information Systems (TOIS)},
  publisher = {ACM},
  volume    = {22},
  number    = {1},
  pages     = {5--53},
  year      = {2004},
}

% Covington, Adams and Sargin (2016), RecSys paper. The product name is
% brace-protected with its proper capitalisation.
@inproceedings{recommender3,
  author       = {Covington, Paul and Adams, Jay and Sargin, Emre},
  title        = {Deep neural networks for {YouTube} recommendations},
  booktitle    = {Proceedings of the 10th ACM conference on recommender systems},
  pages        = {191--198},
  year         = {2016},
  organization = {ACM},
}

% He et al. (2017), WWW conference paper on neural collaborative filtering.
@inproceedings{ncf,
  author       = {He, Xiangnan and Liao, Lizi and Zhang, Hanwang and Nie, Liqiang and Hu, Xia and Chua, Tat-Seng},
  title        = {Neural collaborative filtering},
  booktitle    = {Proceedings of the 26th international conference on world wide web},
  organization = {International World Wide Web Conferences Steering Committee},
  pages        = {173--182},
  year         = {2017},
}

% Chetlur et al. (2014), arXiv preprint. The library name is brace-protected
% with its proper capitalisation.
@article{cudnn,
  author  = {Chetlur, Sharan and Woolley, Cliff and Vandermersch, Philippe and Cohen, Jonathan and Tran, John and Catanzaro, Bryan and Shelhamer, Evan},
  title   = {{cuDNN}: Efficient primitives for deep learning},
  journal = {arXiv preprint arXiv:1410.0759},
  year    = {2014},
}

% Ioffe and Szegedy (2015), arXiv preprint on batch normalization.
@article{bn,
  author  = {Ioffe, Sergey and Szegedy, Christian},
  title   = {Batch normalization: Accelerating deep network training by reducing internal covariate shift},
  journal = {arXiv preprint arXiv:1502.03167},
  year    = {2015},
}

% Abadi et al. (2005), ACM TOIT article. The first author's given name carries
% the same accent used for this author in the "tf" entry of this file.
@article{memorybound,
  author    = {Abadi, Mart{\'\i}n and Burrows, Mike and Manasse, Mark and Wobber, Ted},
  title     = {Moderately hard, memory-bound functions},
  journal   = {ACM Transactions on Internet Technology (TOIT)},
  volume    = {5},
  number    = {2},
  pages     = {299--327},
  year      = {2005},
  publisher = {ACM},
}

% Chandra et al. (2001), book. The publisher name is capitalised in full and
% the API name in the title is brace-protected.
@book{openmp,
  author    = {Chandra, Rohit and Dagum, Leo and Kohr, David and Menon, Ramesh and Maydan, Dror and McDonald, Jeff},
  title     = {Parallel programming in {OpenMP}},
  publisher = {Morgan Kaufmann},
  year      = {2001},
}

% Chapman, Jost and Van Der Pas (2008), book. The publisher name is capitalised
% as in the other MIT Press entries of this file, and the API name is protected.
@book{openmp2,
  author    = {Chapman, Barbara and Jost, Gabriele and Van Der Pas, Ruud},
  title     = {Using {OpenMP}: portable shared memory parallel programming},
  volume    = {10},
  publisher = {MIT Press},
  year      = {2008},
}

% Seide et al. (2014), ICASSP paper. The acronym in the title is restored to
% capitals and brace-protected.
@inproceedings{sync1,
  author       = {Seide, Frank and Fu, Hao and Droppo, Jasha and Li, Gang and Yu, Dong},
  title        = {On parallelizability of stochastic gradient descent for speech {DNNs}},
  booktitle    = {2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {235--239},
  year         = {2014},
  organization = {IEEE},
}

% Zinkevich et al. (2010), conference paper on parallelized SGD.
@inproceedings{sync2,
  author    = {Zinkevich, Martin and Weimer, Markus and Li, Lihong and Smola, Alex J},
  title     = {Parallelized stochastic gradient descent},
  booktitle = {Advances in neural information processing systems},
  pages     = {2595--2603},
  year      = {2010},
}

% Kimmel et al. (2000): US patent record. The month field concatenates the
% predefined "aug" macro with a tied day number.
@misc{numa,
  author    = {Kimmel, Jeffrey S and Alfieri, Robert A and Miles, A and McGrath, William K and McLeod, Michael J and O'connell, Mark A and Simpson, Guy A},
  title     = {Operating system for a non-uniform memory access multiprocessor system},
  note      = {US Patent 6,105,053},
  publisher = {Google Patents},
  month     = aug # "~15",
  year      = {2000},
}

% Borkar (2007), DAC paper. The title separator is restored: the exported
% text had fused the subtitle onto the preceding word ("chipsa").
@inproceedings{core-memory,
  author       = {Borkar, Shekhar},
  title        = {Thousand core chips: A technology perspective},
  booktitle    = {2007 44th ACM/IEEE Design Automation Conference},
  pages        = {746--749},
  year         = {2007},
  organization = {IEEE},
}

% Sandler et al. (2018), CVPR paper. The model name is brace-protected with
% its proper capitalisation.
@inproceedings{mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted residuals and linear bottlenecks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {4510--4520},
  year      = {2018},
}

% Krizhevsky (2014), arXiv preprint on parallelizing convolutional networks.
@article{linear,
  author  = {Krizhevsky, Alex},
  title   = {One weird trick for parallelizing convolutional neural networks},
  journal = {arXiv preprint arXiv:1404.5997},
  year    = {2014},
}

% Li, Dongarra and Tomov (2009), ICCS paper on GEMM auto-tuning.
@inproceedings{gemm-size,
  author       = {Li, Yinan and Dongarra, Jack and Tomov, Stanimire},
  title        = {A note on auto-tuning GEMM for GPUs},
  booktitle    = {International Conference on Computational Science},
  organization = {Springer},
  pages        = {884--892},
  year         = {2009},
}

% Wulf and McKee (1995), SIGARCH Computer Architecture News article.
@article{memwall,
  author    = {Wulf, Wm A and McKee, Sally A},
  title     = {Hitting the memory wall: implications of the obvious},
  journal   = {ACM SIGARCH computer architecture news},
  publisher = {ACM},
  volume    = {23},
  number    = {1},
  pages     = {20--24},
  year      = {1995},
}

% Jacob et al. (2018), CVPR paper on integer-arithmetic-only inference.
@inproceedings{int8,
  author    = {Jacob, Benoit and Kligys, Skirmantas and Chen, Bo and Zhu, Menglong and Tang, Matthew and Howard, Andrew and Adam, Hartwig and Kalenichenko, Dmitry},
  title     = {Quantization and training of neural networks for efficient integer-arithmetic-only inference},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {2704--2713},
  year      = {2018},
}

% Rastegari et al. (2016), ECCV paper. The model and dataset names are
% brace-protected with their proper capitalisation.
@inproceedings{xnornet,
  author       = {Rastegari, Mohammad and Ordonez, Vicente and Redmon, Joseph and Farhadi, Ali},
  title        = {{XNOR-Net}: {ImageNet} classification using binary convolutional neural networks},
  booktitle    = {European Conference on Computer Vision},
  pages        = {525--542},
  year         = {2016},
  organization = {Springer},
}

% Li, Zhang and Liu (2016), arXiv preprint on ternary weight networks.
@article{twn,
  author  = {Li, Fengfu and Zhang, Bo and Liu, Bin},
  title   = {Ternary weight networks},
  journal = {arXiv preprint arXiv:1605.04711},
  year    = {2016},
}

% Courbariaux, Bengio and David (2015). The method name is brace-protected
% with its proper capitalisation.
@inproceedings{bwn,
  author    = {Courbariaux, Matthieu and Bengio, Yoshua and David, Jean-Pierre},
  title     = {{BinaryConnect}: Training deep neural networks with binary weights during propagations},
  booktitle = {Advances in neural information processing systems},
  pages     = {3123--3131},
  year      = {2015},
}

% Rotem et al. (2018), arXiv preprint on the Glow compiler.
@article{glow,
  author  = {Rotem, Nadav and Fix, Jordan and Abdulrasool, Saleem and Catron, Garret and Deng, Summer and Dzhabarov, Roman and Gibson, Nick and Hegeman, James and Lele, Meghan and Levenstein, Roman and others},
  title   = {Glow: Graph lowering compiler techniques for neural networks},
  journal = {arXiv preprint arXiv:1805.00907},
  year    = {2018},
}

% Vasilache et al. (2018), arXiv preprint on Tensor Comprehensions.
@article{tensorcom,
  author  = {Vasilache, Nicolas and Zinenko, Oleksandr and Theodoridis, Theodoros and Goyal, Priya and DeVito, Zachary and Moses, William S and Verdoolaege, Sven and Adams, Andrew and Cohen, Albert},
  title   = {Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions},
  journal = {arXiv preprint arXiv:1802.04730},
  year    = {2018},
}

% Dongarra et al. (2017), Procedia Computer Science article on batched BLAS.
@article{gemmsize2,
  author    = {Dongarra, Jack and Hammarling, Sven and Higham, Nicholas J and Relton, Samuel D and Valero-Lara, Pedro and Zounon, Mawussi},
  title     = {The design and performance of batched BLAS on modern high-performance computing systems},
  journal   = {Procedia Computer Science},
  publisher = {Elsevier},
  volume    = {108},
  pages     = {495--504},
  year      = {2017},
}

% Shi et al. (2016), HiPC paper. The acronyms in the title are restored to
% capitals and brace-protected.
@inproceedings{gemmsize3,
  author       = {Shi, Yang and Niranjan, Uma Naresh and Anandkumar, Animashree and Cecka, Cris},
  title        = {Tensor contractions with extended {BLAS} kernels on {CPU} and {GPU}},
  booktitle    = {2016 IEEE 23rd International Conference on High Performance Computing (HiPC)},
  pages        = {193--202},
  year         = {2016},
  organization = {IEEE},
}

% Durrani et al. (2014), WMT workshop paper. The title uses an ASCII
% apostrophe (the typographic quote is not safe under 8-bit BibTeX), and the
% workshop acronym is brace-protected.
@inproceedings{WMT,
  author    = {Durrani, Nadir and Haddow, Barry and Koehn, Philipp and Heafield, Kenneth},
  title     = {Edinburgh's phrase-based machine translation systems for {WMT-14}},
  booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},
  pages     = {97--104},
  year      = {2014},
}

% Adve and Gharachorloo (1996), article in Computer. The journal name is
% capitalised as in the other "Computer" entries of this file.
@article{sharedmem,
  author    = {Adve, Sarita V and Gharachorloo, Kourosh},
  title     = {Shared memory consistency models: A tutorial},
  journal   = {Computer},
  volume    = {29},
  number    = {12},
  pages     = {66--76},
  year      = {1996},
  publisher = {IEEE},
}

% Georganas et al. (2019), arXiv preprint.
@article{blocks,
  author  = {Georganas, Evangelos and Banerjee, Kunal and Kalamkar, Dhiraj and Avancha, Sasikanth and Venkat, Anand and Anderson, Michael and Henry, Greg and Pabst, Hans and Heinecke, Alexander},
  title   = {High-Performance Deep Learning via a Single Building Block},
  journal = {arXiv preprint arXiv:1906.06440},
  year    = {2019},
}

% Web resource cited by URL. Typed as misc (it has no author, journal or
% year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{movielens,
  key          = {MovieLens},
  title        = {{MovieLens} {20M} dataset},
  howpublished = {\url{https://grouplens.org/datasets/movielens/20m/}},
}

% Web resource cited by URL. Typed as misc, with the URL wrapped in \url so
% the tilde is typeset literally (in a plain title field it would become a
% non-breaking space).
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{cifar10,
  key          = {CIFAR-10},
  title        = {The {CIFAR-10} dataset},
  howpublished = {\url{https://www.cs.toronto.edu/~kriz/cifar.html}},
}

% Web resource cited by URL. Typed as misc (it has no author, journal or
% year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{sherlock,
  key          = {Sherlock},
  title        = {{Sherlock} {Holmes} stories ({Kaggle} dataset)},
  howpublished = {\url{https://www.kaggle.com/idevji1/sherlock-holmes-stories}},
}

% Source repository cited by URL. Typed as misc (it has no author, journal
% or year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a placeholder label taken from the URL - confirm.
@misc{parax,
  key          = {parax},
  title        = {parax source code repository},
  howpublished = {\url{https://github.com/anonymous-nicer/parax}},
}

% Web resource cited by URL. Typed as misc (it has no author, journal or
% year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{avx512,
  key          = {Intel AVX-512},
  title        = {{Intel} {AVX-512} instructions},
  howpublished = {\url{https://software.intel.com/en-us/articles/intel-avx-512-instructions}},
}

% Web resource cited by URL. Typed as misc (it has no author, journal or
% year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{aws,
  key          = {Amazon EC2},
  title        = {{Amazon} {EC2} {Spot} {Instances}},
  howpublished = {\url{https://aws.amazon.com/ec2/spot}},
}

% Web resource cited by URL. Typed as misc (it has no author, journal or
% year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{mic,
  key          = {Microsoft Azure},
  title        = {{Microsoft} {Azure} virtual machines},
  howpublished = {\url{https://azure.microsoft.com/en-us/free/virtual-machines}},
}

% Web resource cited by URL. Typed as misc (it has no author, journal or
% year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{gle,
  key          = {Google Cloud},
  title        = {{Google} {Cloud} preemptible {VM} instances},
  howpublished = {\url{https://cloud.google.com/preemptible-vms}},
}

% Liu, Schmidt and Maskell (2010), BMC Research Notes article on CUDASW++ 2.0.
@article{simt,
  author    = {Liu, Yongchao and Schmidt, Bertil and Maskell, Douglas L},
  title     = {CUDASW++ 2.0: enhanced Smith-Waterman protein database search on CUDA-enabled GPUs based on SIMT and virtualized SIMD abstractions},
  journal   = {BMC research notes},
  publisher = {BioMed Central},
  volume    = {3},
  number    = {1},
  pages     = {93},
  year      = {2010},
}

% Mukundan et al. (2013), paper carried in ACM SIGARCH Computer Architecture News.
@inproceedings{ddr4,
  author       = {Mukundan, Janani and Hunter, Hillery and Kim, Kyu-hyoun and Stuecheli, Jeffrey and Mart{\'\i}nez, Jos{\'e} F},
  title        = {Understanding and mitigating refresh overheads in high-density DDR4 DRAM systems},
  booktitle    = {ACM SIGARCH Computer Architecture News},
  organization = {ACM},
  volume       = {41},
  number       = {3},
  pages        = {48--59},
  year         = {2013},
}

% Wu et al. (2016), arXiv preprint on Google's neural machine translation system.
@article{gnmt,
  author  = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V and Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and others},
  title   = {Google's neural machine translation system: Bridging the gap between human and machine translation},
  journal = {arXiv preprint arXiv:1609.08144},
  year    = {2016},
}

% Zhang et al. (2015), arXiv preprint. The algorithm name is restored to its
% capitalised form and brace-protected.
@article{staleness,
  author  = {Zhang, Wei and Gupta, Suyog and Lian, Xiangru and Liu, Ji},
  title   = {Staleness-aware {Async-SGD} for distributed deep learning},
  journal = {arXiv preprint arXiv:1511.05950},
  year    = {2015},
}

% Reddi et al. (2015), conference paper on variance reduction in SGD.
@inproceedings{variance,
  author    = {Reddi, Sashank J and Hefny, Ahmed and Sra, Suvrit and Poczos, Barnabas and Smola, Alexander J},
  title     = {On variance reduction in stochastic gradient descent and its asynchronous variants},
  booktitle = {Advances in neural information processing systems},
  pages     = {2647--2655},
  year      = {2015},
}

% Zheng et al. (2017), ICML paper on delay-compensated asynchronous SGD.
@inproceedings{asynchronous,
  author    = {Zheng, Shuxin and Meng, Qi and Wang, Taifeng and Chen, Wei and Yu, Nenghai and Ma, Zhi-Ming and Liu, Tie-Yan},
  title     = {Asynchronous stochastic gradient descent with delay compensation},
  booktitle = {International Conference on Machine Learning},
  pages     = {4120--4129},
  year      = {2017},
}

% Dean et al. (2012); the same work is also listed in this file as "datap2".
% The capital inside the seventh author's given name is restored.
@inproceedings{large_scale,
  author    = {Dean, Jeffrey and Corrado, Greg and Monga, Rajat and Chen, Kai and Devin, Matthieu and Mao, Mark and Ranzato, Marc'Aurelio and Senior, Andrew and Tucker, Paul and Yang, Ke and others},
  title     = {Large scale distributed deep networks},
  booktitle = {Advances in neural information processing systems},
  pages     = {1223--1231},
  year      = {2012},
}

% Chen et al. (2016), arXiv preprint; also listed in this file as "datap1".
@article{revisiting,
  author  = {Chen, Jianmin and Pan, Xinghao and Monga, Rajat and Bengio, Samy and Jozefowicz, Rafal},
  title   = {Revisiting distributed synchronous SGD},
  journal = {arXiv preprint arXiv:1604.00981},
  year    = {2016},
}

% Chen et al. (2016), arXiv preprint on sublinear-memory training.
@article{tianqi,
  author  = {Chen, Tianqi and Xu, Bing and Zhang, Chiyuan and Guestrin, Carlos},
  title   = {Training Deep Nets with Sublinear Memory Cost},
  journal = {arXiv preprint arXiv:1604.06174},
  year    = {2016},
}

% You, Gitman and Ginsburg (2017), arXiv preprint. The stray volume field is
% dropped (a preprint has no journal volume), and the acronym, batch size and
% dataset name are restored to capitals and brace-protected.
@article{lars,
  author  = {You, Yang and Gitman, Igor and Ginsburg, Boris},
  title   = {Scaling {SGD} batch size to {32K} for {ImageNet} training},
  journal = {arXiv preprint arXiv:1708.03888},
  year    = {2017},
}

% Hoffer, Hubara and Soudry (2017), conference paper on large-batch generalization.
@inproceedings{long_train,
  author    = {Hoffer, Elad and Hubara, Itay and Soudry, Daniel},
  title     = {Train longer, generalize better: closing the generalization gap in large batch training of neural networks},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1731--1741},
  year      = {2017},
}

% Lin et al. (2019), arXiv preprint on dynamic mini-batch SGD.
@article{dynamic_sgd,
  author  = {Lin, Haibin and Zhang, Hang and Ma, Yifei and He, Tong and Zhang, Zhi and Zha, Sheng and Li, Mu},
  title   = {Dynamic mini-batch SGD for elastic distributed training: learning in the limbo of resources},
  journal = {arXiv preprint arXiv:1904.12043},
  year    = {2019},
}

% Robbins and Monro (1951), journal article. Volume and issue number are
% supplied (the export carried pages only), and the journal name is given in
% its capitalised form.
@article{mom_sgd,
  author    = {Robbins, Herbert and Monro, Sutton},
  title     = {A stochastic approximation method},
  journal   = {The Annals of Mathematical Statistics},
  volume    = {22},
  number    = {3},
  pages     = {400--407},
  year      = {1951},
  publisher = {JSTOR},
}

% Paszke et al. (2019). The library name is brace-protected with its proper
% capitalisation.
@inproceedings{pytorch,
  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others},
  title     = {{PyTorch}: An imperative style, high-performance deep learning library},
  booktitle = {Advances in neural information processing systems},
  pages     = {8026--8037},
  year      = {2019},
}


% A second, field-for-field identical definition of the key "int8" used to
% sit here. The key is already defined earlier in this file, and BibTeX
% reports a "repeated entry" error for a redefined key, so only the first
% definition is kept.


% Devarakonda, Naumov and Garland (2017), arXiv preprint. The method name is
% brace-protected with its proper capitalisation.
@article{adabatch,
  author  = {Devarakonda, Aditya and Naumov, Maxim and Garland, Michael},
  title   = {{AdaBatch}: Adaptive batch sizes for training deep neural networks},
  journal = {arXiv preprint arXiv:1712.02029},
  year    = {2017},
}

% Smith et al. (2017), arXiv preprint; the same title also appears in this
% file under the key "don2".
@article{dondecay,
  author  = {Smith, Samuel L and Kindermans, Pieter-Jan and Ying, Chris and Le, Quoc V},
  title   = {Don't decay the learning rate, increase the batch size},
  journal = {arXiv preprint arXiv:1711.00489},
  year    = {2017},
}

% McCandlish et al. (2018), arXiv preprint. The last author is a group, so it
% is wrapped in an extra brace pair to be treated as one indivisible name
% instead of being parsed as surname "Team", given names "OpenAI Dota".
@article{empirical,
  author  = {McCandlish, Sam and Kaplan, Jared and Amodei, Dario and {OpenAI Dota Team}},
  title   = {An empirical model of large-batch training},
  journal = {arXiv preprint arXiv:1812.06162},
  year    = {2018},
}

% Chung (1954), journal article. Typed as an article with its journal data
% (as a book it lacked the required publisher and year); the all-capitals
% title is normalised. The citation key is left unchanged so existing
% citations keep resolving.
@article{momenmtum,
  author  = {Chung, Kai Lai},
  title   = {On a stochastic approximation method},
  journal = {The Annals of Mathematical Statistics},
  volume  = {25},
  number  = {3},
  pages   = {463--483},
  year    = {1954},
}

% Web resource cited by URL. Typed as misc (it has no author, journal or
% year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{nv,
  key          = {NVIDIA NVLink},
  title        = {{NVIDIA} {NVLink} bridges},
  howpublished = {\url{https://www.nvidia.cn/design-visualization/nvlink-bridges/}},
}

% Web resource cited by URL. Typed as misc (it has no author, journal or
% year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{dma,
  key          = {NVIDIA GPUDirect},
  title        = {{NVIDIA} {GPUDirect}},
  howpublished = {\url{https://developer.nvidia.com/gpudirect}},
}

% Web resource cited by URL. Typed as misc (it has no author, journal or
% year), with the URL in howpublished as in this file's "mkldnn" entry.
% NOTE(review): the title is a descriptive label inferred from the URL - confirm.
@misc{tensorrt,
  key          = {NVIDIA TensorRT},
  title        = {{NVIDIA} {TensorRT}},
  howpublished = {\url{https://developer.nvidia.com/tensorrt}},
}

% Lin et al. (2018), arXiv preprint. The placeholder journal "online" is
% replaced by the preprint identifier, in the same form used by the other
% arXiv entries of this file; the acronym in the title is brace-protected.
@article{don,
  author  = {Lin, Tao and Stich, Sebastian U and Patel, Kumar Kshitij and Jaggi, Martin},
  title   = {Don't Use Large Mini-Batches, Use Local {SGD}},
  journal = {arXiv preprint arXiv:1808.07217},
  year    = {2018},
}

% Smith et al. (2017), arXiv preprint; the same work is listed in this file
% under the key "dondecay". The journal and the hyphenated given name now
% match that entry (the journal was the placeholder "online").
@article{don2,
  author  = {Smith, Samuel L and Kindermans, Pieter-Jan and Ying, Chris and Le, Quoc V},
  title   = {Don't Decay the Learning Rate, Increase the Batch Size},
  journal = {arXiv preprint arXiv:1711.00489},
  year    = {2017},
}

% Smith and Le (2017), arXiv preprint. The placeholder journal "online" is
% replaced by the preprint identifier, in the same form used by the other
% arXiv entries of this file; the proper adjective in the title is protected.
@article{don3,
  author  = {Smith, Samuel L and Le, Quoc V},
  title   = {A {Bayesian} Perspective on Generalization and Stochastic Gradient Descent},
  journal = {arXiv preprint arXiv:1710.06451},
  year    = {2017},
}



% Dean et al. (2012); the same work is listed in this file as "datap2" and
% "large_scale". Typed as a conference paper with the year and pages those
% entries give, and with the full author list (the export skipped from the
% fourth author straight to the last one).
@inproceedings{async0,
  author    = {Dean, Jeffrey and Corrado, Greg S and Monga, Rajat and Chen, Kai and Devin, Matthieu and Mao, Mark and Ranzato, Marc'Aurelio and Senior, Andrew and Tucker, Paul and Yang, Ke and Le, Quoc V and Ng, Andrew Y},
  title     = {Large Scale Distributed Deep Networks},
  booktitle = {Advances in neural information processing systems},
  pages     = {1223--1231},
  year      = {2012},
}

% Chen et al. (2020), MLSys paper on SLIDE.
@inproceedings{DBLP:slide,
  author    = {Beidi Chen and Tharun Medini and James Farwell and Sameh Gobriel and Charlie Tai and Anshumali Shrivastava},
  title     = {{SLIDE} : In Defense of Smart Algorithms over Hardware Acceleration for Large-Scale Deep Learning Systems},
  booktitle = {MLSys},
  year      = {2020},
  pages     = {1--16},
}

% Zhang et al. (2018), USENIX Annual Technical Conference paper on DeepCPU.
@inproceedings{2018DeepCPU,
  author    = {Zhang, Minjia and Rajbhandari, Samyam and Wang, Wenhan and He, Yuxiong},
  title     = {DeepCPU: Serving RNN-based Deep Learning Models 10x Faster},
  booktitle = {USENIX Annual Technical Conference},
  year      = {2018},
}

% Yang et al. (2017), IEEE NAS conference paper on DD-L1D.
@inproceedings{DD,
  author    = {Yang, Weiguang and Wang, Yuxin and Yu, Yulong and Kan, Guangyuan and Guo, He},
  title     = {DD-L1D: Improving the Decoupled L1D Efficiency for GPU Architecture},
  booktitle = {12th IEEE International Conference on Networking, Architecture, and Storage (NAS 2017)},
  year      = {2017},
}

